Code
library (tidyverse)
library (magrittr)
library (gt)
library (patchwork)
library (SummarizedExperiment)
library (tidySummarizedExperiment)
library (labelled)
library (vegan)
library (RColorBrewer)
# tmp <- fs::dir_map("R scripts mg/", source)
tmp <- fs:: dir_map ("R scripts/" , source)
theme_set (theme_light ())
Loading the data
Code
data_source <- "real"
data_dir <- get_data_dir (data_source)
if (data_source == TRUE ) {
mg_dir <- str_c (data_dir, "03 metagenomics combined/" )
counts <- read_csv (str_c (mg_dir, "mg_combined.csv" ))
manifest <- read_csv (str_c (mg_dir, "mg_combined_manifest.csv" ))
} else {
mg_dir <- str_c (data_dir, "02 Metagenomics/" )
counts <- read_csv (str_c (mg_dir, "MVIBR_kSanityVIRGO2_ReadCounts_20250404.csv" ))
counts_corr <- read_csv (str_c (mg_dir, "MVIBR_kSanityVIRGO2_taxaGLcor_20250404.csv" ))
relabs <- read_csv (str_c (mg_dir, "MVIBR_kSanityVIRGO2_RelAbund_20250404.csv" ))
technical_metadata <- read_csv (str_c (mg_dir, "VIBRANT_MG_technicalMetaData_20250404.csv" ))
LBP_strain_info <- readxl:: read_xlsx (str_c (data_dir, "00 Trial Data/IsolateNumbers.xlsx" ))
VIRGO2_taxonomic_table <- read_csv (str_c (mg_dir, "VIRGO2_taxonomy_key_250429.csv" ))
}
We load the following tables, all provided by Michael France:
counts raw counts per LBP strain and taxa (dimensions: 966 samples x 783 columns)
counts_corr counts per LBP strain and taxa corrected by gene length (dimensions: 966 samples x 782 columns)
relabs relative abundances per LBP strain and taxa, computed by M. France from counts_corr (dimensions: 966 samples x 782 columns)
technical_metadata technical metadata (dimensions: 1127 samples x 19 columns)
VIRGO2_taxonomic_table taxonomic table for the taxa as defined in VIRGO2 (dimensions: 779 taxa x 10 columns)
We note that some samples have been sequenced twice to increase the coverage. The technical metadata contains a column LibrarySequencedTwice that indicates whether a sample has been sequenced twice.
Code
sample_sequences_twice <-
technical_metadata |>
group_by (UID) |>
mutate (n = n ()) |>
ungroup () |>
arrange (- n) |>
filter (n > 1 )
There are 161 samples that have been sequenced twice.
Code
sample_sequences_twice |>
select (UID, SampleType, Ext_Lib_Plate, SequencingRun, BioinformaticsProcessingBatch, ` Selected4re-extraction ` , LibrarySequencedTwice, Notes) |>
gt (caption = "Samples that have been sequenced twice" )
Note: once we have participants/visits info, we can check whether there are any systematic patterns in these samples.
We note some negative (although extremely small) values in the counts, counts_corr, and rel_abs assays (all for native crispatus, I assume it being an artifact from kSANITY). We will set these to 0.
We also load LBP_strain_info from the VIBRANT Dropbox that contains information on the LBP strains
The names of the strains do not correspond exactly to those on the VIBRANT Dropbox folder. We modify them to match those provided by Michael.
Code
LBP_strain_info <-
LBP_strain_info |>
mutate (
strain_id = ` VMRC ID ` ,
LBP = ifelse (is.na (LC106), "LC-115" , "LC-106 & LC-115" ) |> factor (),
strain_origin = ` Geographic source ` |> factor ()
) |>
dplyr:: rename (Biose_ID = ` Biose ID ` ) |> #VMRC_ID = `VMRC ID`
dplyr:: select (strain_id, LBP, strain_origin, Biose_ID) |>
arrange (strain_origin, LBP) |>
mutate (strain_id = strain_id |> fct_inorder ()) |>
dplyr:: select (strain_id, LBP, strain_origin, contains ("ID" )) |>
mutate (
strain_id_mg = sub ("_.*" , "" , strain_id),
strain_id_mg =
case_when (
strain_id_mg == "CC0028A1" ~ "C0028A1" ,
TRUE ~ strain_id_mg
)
)
LBP_strain_info |>
set_variable_labels (
strain_id = "Strain ID" ,
strain_id_mg = "Strain ID (as in metagenomics data)" ,
strain_origin = "Strain origin" ,
Biose_ID = "Biose ID"
) |>
gt (caption = "LBP strain information" ) |>
tab_style (style = cell_text (weight = "bold" ),
locations = cells_column_labels ())
LBP strain information
Strain ID
LBP
Strain origin
Biose ID
Strain ID (as in metagenomics data)
FF00018
LC-106 & LC-115
SA
GA08
FF00018
FF00051
LC-106 & LC-115
SA
GA09
FF00051
UC101_2_33
LC-106 & LC-115
SA
GA12
UC101
FF00004
LC-115
SA
GA07
FF00004
FF00064
LC-115
SA
GA10
FF00064
FF00072
LC-115
SA
GA11
FF00072
UC119_2_PB0238
LC-115
SA
GA13
UC119
122010_1999_16
LC-115
SA
GA14
122010
185329_1999_17
LC-115
SA
GA15
185329
C0059E1
LC-106 & LC-115
US
GA03
C0059E1
C0022A1
LC-106 & LC-115
US
GA04
C0022A1
C0175A1
LC-106 & LC-115
US
GA06
C0175A1
C0006A1
LC-115
US
GA01
C0006A1
CC0028A1
LC-115
US
GA02
C0028A1
C0112A1
LC-115
US
GA05
C0112A1
Creating a SummarizedExperiment object from these tables
The SummarizedExperiment object will contain the following assays:
counts
counts_corr
rel_abs
The SE colData will contain the technical metadata, the total number of reads for each sample, along with the metagenomics-based CST, subCST, and VALENCIA’s assignment scores. For samples that have been sequenced twice, we will merge the sequencing and processing information from both sequencing runs.
The SE rowData will contain the VIRGO2 taxonomic table, augmented with the LBP strain information.
NOTE : At the moment, the table counts includes an extra column “multiGenera”. For now, we remove the MuliGenera from the counts data frame; but in the future, Michael should provide the corrected counts and relative abundances with this extra column.
Code
mg_to_SE <- function (counts, counts_corr, relabs, LBP_strain_info, technical_metadata, VIRGO2_taxonomic_table) {
technical_metadata_uid <-
technical_metadata |>
group_by (UID) |>
summarise (
across (everything (), ~ unique (.) |> na.omit () |> str_c (collapse = "; " )),
) |>
mutate (
LibrarySequencedTwice = ifelse (LibrarySequencedTwice == 1 , TRUE , FALSE ),
Ext_Lib_Plate = Ext_Lib_Plate |> as.integer ()
)
# remove "multiGenera" from counts # TODO : change later
counts <-
counts |>
select (- c (MultiGenera))
assay_counts <-
counts |>
dplyr:: select (- c (CST, subCST, score)) |>
as.data.frame () |>
column_to_rownames ("sampleID" ) |>
drop_na () |>
t () |>
pmax (0 )
assay_counts_corr <-
counts_corr |>
dplyr:: select (- c (CST, subCST, score)) |>
as.data.frame () |>
column_to_rownames ("sampleID" ) |>
t () |>
pmax (0 )
assay_relative_ab <-
relabs |>
dplyr:: select (- c (CST, subCST, score)) |>
as.data.frame () |>
column_to_rownames ("sampleID" ) |>
t () |>
pmax (0 )
# include a total reads per sample in the colData
se_coldata <-
counts |>
select (sampleID, CST, subCST, score) |>
mutate (
total_non_human_reads = rowSums (counts |> select (- c (sampleID, CST, subCST, score)))
) |>
select (sampleID, total_non_human_reads, everything ()) |>
left_join (
technical_metadata_uid,
by = c ("sampleID" = "UID" )
) |>
as.data.frame ()
se_rowdata <-
tibble (
taxon_id =
counts |>
dplyr:: select (- c (sampleID, CST, subCST, score)) |>
colnames ()
) |>
left_join (
VIRGO2_taxonomic_table |>
mutate (
taxon_id = Taxa,
last_available_taxonomic_level =
Full_taxonomy |> str_remove_all (".*;" ) |> str_remove_all ("_.*" )
) |>
relocate (Full_taxonomy, .after = Species),
by = join_by (taxon_id)
) |>
left_join (
LBP_strain_info |>
dplyr:: rename (taxon_id = strain_id_mg),
by = join_by (taxon_id)
) |>
mutate (
taxon_label =
ifelse (! is.na (LBP), "LBP strain " , "" ) |>
str_c (taxon_id |> str_replace_all ("_" , " " )) |>
str_c (
ifelse (
last_available_taxonomic_level %in% c ("g" ,"f" ),
str_c (" (" , last_available_taxonomic_level, ")" ), ""
)
) |>
str_c (
ifelse (taxon_id == "Lactobacillus_crispatus" , " (native strains)" , "" )
)
) |>
relocate (taxon_label, .after = taxon_id) |>
as.data.frame ()
# Harmonization of the order of samples and feature
ordered_sample_ids <- se_coldata$ sampleID
assay_counts <- assay_counts[, ordered_sample_ids]
assay_counts_corr <- assay_counts_corr[, ordered_sample_ids]
assay_relative_ab <- assay_relative_ab[, ordered_sample_ids]
ordered_taxa <- se_rowdata$ taxon_id
assay_counts <- assay_counts[ordered_taxa, ]
assay_counts_corr <- assay_counts_corr[ordered_taxa, ]
assay_relative_ab <- assay_relative_ab[ordered_taxa, ]
SummarizedExperiment:: SummarizedExperiment (
assays = list (counts = assay_counts, counts_corr = assay_counts_corr, rel_abs = assay_relative_ab),
rowData = se_rowdata,
colData = se_coldata
)
}
Code
SE_mg <- mg_to_SE (counts, counts_corr, relabs, LBP_strain_info, technical_metadata, VIRGO2_taxonomic_table)
Exploratory & QC analyses
Code
# A SummarizedExperiment-tibble abstraction: 751,548 × 45
# Features=778 | Samples=966 | Assays=counts, counts_corr, rel_abs
.feature .sample counts counts_corr rel_abs sampleID total_non_human_reads
<chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
1 122010 MG_1010 0 0 0 MG_1010 9671181.
2 185329 MG_1010 0 0 0 MG_1010 9671181.
3 C0006A1 MG_1010 0 0 0 MG_1010 9671181.
4 C0022A1 MG_1010 1408234. 1494. 0.137 MG_1010 9671181.
5 C0028A1 MG_1010 0 0 0 MG_1010 9671181.
6 C0059E1 MG_1010 268235. 285. 0.0261 MG_1010 9671181.
7 C0112A1 MG_1010 0 0 0 MG_1010 9671181.
8 C0175A1 MG_1010 6571758. 6971. 0.639 MG_1010 9671181.
9 FF00004 MG_1010 0 0 0 MG_1010 9671181.
10 FF00018 MG_1010 0 0 0 MG_1010 9671181.
# ℹ 40 more rows
# ℹ 38 more variables: CST <chr>, subCST <chr>, score <dbl>, SampleType <chr>,
# Ext_Lib_Plate <int>, Ext_Lib_Plate_ID <chr>, Ext_Lib_Position <chr>,
# SequencingRun <chr>, DateSequenced <chr>, Lane <chr>, Sample <chr>,
# `Library Pool` <chr>, Library <chr>, FragmentSize <chr>, IndexSet <chr>,
# `Index 1` <chr>, `Index 2` <chr>, BioinformaticsProcessingBatch <chr>,
# `Selected4re-extraction` <chr>, LibrarySequencedTwice <lgl>, Notes <chr>, …
Total number of counts/relative abundances per sample
Total number of non-human reads per sample
Code
SE_mg |>
colData () |>
as_tibble () |>
ggplot () +
aes (x = total_non_human_reads, fill = LibrarySequencedTwice) +
geom_histogram (bins = 50 ) +
scale_x_log10 () +
facet_grid (SampleType ~ ., scale = "free" ) +
labs (
x = "Total number of non-human reads per sample" ,
y = "Number of samples"
) +
theme (
strip.text.y = element_text (angle = 0 )
)
Code
SE_mg |>
colData () |>
as_tibble () |>
arrange (total_non_human_reads) |>
mutate (index = row_number ()) |>
select (index, sampleID, total_non_human_reads, SampleType, LibrarySequencedTwice, subCST) |>
filter (total_non_human_reads < 1e6 ) |>
gt ()
index
sampleID
total_non_human_reads
SampleType
LibrarySequencedTwice
subCST
1
MG_EQ05822476
2
Nuclease-free water
FALSE
IV-A
2
MG_2179903
107164
ClinicalSample
FALSE
III-B
3
MG_EQ05822493
111049
Unused swab+C2
FALSE
III-B
4
MG_2694
122821
ClinicalSample
FALSE
III-B
5
MG_EQ05822499
122897
Nuclease-free water
FALSE
IV-B
6
MG_2196625
211151
ClinicalSample
FALSE
III-B
7
MG_2309505
284381
ClinicalSample
FALSE
III-B
8
MG_3968
303568
ClinicalSample
FALSE
IV-B
9
MG_2287336
380193
ClinicalSample
FALSE
IV-B
10
MG_2193899
425144
ClinicalSample
TRUE
III-B
11
MG_2250465
429267
ClinicalSample
FALSE
IV-C1
12
MG_3344
446679
ClinicalSample
FALSE
III-B
13
MG_5747
469552
ClinicalSample
FALSE
III-A
14
MG_2285838
528459
ClinicalSample
FALSE
III-B
15
MG_3458
548033
ClinicalSample
FALSE
III-B
16
MG_2311303
549857
ClinicalSample
FALSE
III-B
17
MG_2304216
580999
ClinicalSample
TRUE
IV-A
18
MG_1588
618230
ClinicalSample
FALSE
IV-B
19
MG_EQ05822501
635475
Nuclease-free water
FALSE
III-B
20
MG_2265171
722162
ClinicalSample
FALSE
I-B
21
MG_2289558
725940
ClinicalSample
FALSE
III-B
22
MG_2304663
733772
ClinicalSample
FALSE
IV-B
23
MG_972
767569
ClinicalSample
FALSE
III-A
24
MG_2192710
795144
ClinicalSample
TRUE
IV-C1
25
MG_2251190
805893
ClinicalSample
FALSE
IV-B
26
MG_4112
819924
ClinicalSample
TRUE
IV-B
27
MG_3146
822383
ClinicalSample
FALSE
III-B
28
MG_2289564
846815
ClinicalSample
FALSE
IV-B
29
MG_EQ05822500
857771
Unused swab+C2
FALSE
III-B
30
MG_2240695
866059
ClinicalSample
FALSE
IV-B
31
MG_1291
878598
ClinicalSample
FALSE
III-A
32
MG_3282
882023
ClinicalSample
FALSE
IV-B
33
MG_4825
925127
ClinicalSample
FALSE
IV-B
34
MG_2287920
930056
ClinicalSample
FALSE
IV-B
35
MG_1507
932868
ClinicalSample
FALSE
III-B
36
MG_2268611
958264
ClinicalSample
FALSE
III-B
37
MG_2242291
996799
ClinicalSample
TRUE
IV-B
TODO: check if correlated with post-MTZ visit
Total relative abundances per sample
We check that the relative abundances sum to 1 for each sample.
Code
tol <- 1e-5
SE_mg |>
as_tibble () |>
group_by (.sample) |>
summarise (total_rel_abs = sum (rel_abs)) |>
ggplot (aes (x = total_rel_abs)) +
geom_histogram (binwidth = tol) +
labs (x = "Total relative abundances per sample" ,
y = "Number of samples" ) +
scale_x_continuous (limits = 1 + c (- 1 , 1 ) * 5 * tol)
Code
Code
# We check the relative abundances computation (should be `counts_corr/sum(counts_corr)`).
tmp <-
SE_mg |>
as_tibble () |>
group_by (.sample) |>
mutate (rel_abs_check = counts_corr/ sum (counts_corr)) |>
ungroup ()
if (max (tmp$ rel_abs_check - tmp$ rel_abs) > 0.01 ) warning ("!! Relative abundances do not match the computed values !!" )
Non-bacterial DNA
There are some non-bacterial taxa in the dataset:
Code
SE_mg |>
as_tibble () |>
filter (Category != "Bacteria" ) |>
group_by (.sample, Category) |>
summarize (rel_abs = sum (rel_abs), .groups = "drop" ) |>
ggplot () +
aes (x = rel_abs, fill = Category) +
geom_histogram (binwidth = 0.001 ) +
facet_wrap (~ Category, scales = "free" ) +
scale_x_continuous ("Total relative abundance per sample for that group" , labels = scales:: percent_format ()) +
ylab ("Number of samples" ) +
guides (fill = "none" )
Code
SE_mg |>
as_tibble () |>
filter (Category != "Bacteria" ) |>
group_by (.sample) |>
mutate (total_rel_ab = sum (rel_abs)) |>
ungroup () |>
arrange (- total_rel_ab, - rel_abs) |>
mutate (
.sample = .sample |> fct_inorder (),
taxon_label = taxon_label |> fct_inorder ()
) |>
filter (as.numeric (.sample) <= 20 ) |>
ggplot () +
aes (x = rel_abs, y = .sample |> fct_rev (), fill = Category) +
geom_col () +
scale_x_continuous ("Relative abundance" , labels = scales:: percent_format ()) +
ylab ("Top 20 samples with most non-bacterial relative abundances" )
Code
SE_mg |>
as_tibble () |>
filter (Category != "Bacteria" ) |>
group_by (taxon_label) |>
mutate (total_rel_ab = sum (rel_abs), max_rel_ab = max (rel_abs)) |>
ungroup () |>
arrange (- total_rel_ab, - rel_abs) |>
mutate (
.sample = .sample |> fct_inorder (),
taxon_label = taxon_label |> fct_inorder ()
) |>
filter (as.numeric (taxon_label) <= 20 , max_rel_ab > 0.001 ) |>
ggplot () +
aes (x = rel_abs, y = taxon_label |> fct_rev (), color = Category) +
geom_point (alpha = 0.7 ) +
scale_x_continuous ("Relative abundance" , labels = scales:: percent_format ()) +
ylab ("Top non-bacterial organisms" ) +
theme (strip.text.y = element_text (angle = 0 )) +
labs (caption = "Each dot is a sample." )
Consequently, we create a new assay which contains the relative abundances of the bacteria only.
Code
SE_mg <-
SE_mg |>
mutate (
rel_abs_bact = rel_abs * (! is.na (Category) & (Category == "Bacteria" ))
)
assay (SE_mg, "rel_abs_bact" ) <- t (t (assay (SE_mg, "rel_abs_bact" ))/ colSums (assay (SE_mg, "rel_abs_bact" )))
Control samples
There are 4 categories of control samples:
Code
SE_mg |>
colData () |>
as_tibble () |>
filter (SampleType != "ClinicalSample" ) |>
select (SampleType, sampleID, Ext_Lib_Plate) |>
group_by (SampleType) |>
arrange (SampleType, Ext_Lib_Plate) |>
gt (caption = "Control samples" , row_group_as_column = TRUE )
Control samples
sampleID
Ext_Lib_Plate
Mock 1
MG_EQ05822515
1
MG_EQ05822481
5
MG_EQ05822509
8
MG_EQ05822503
10
MG_EQ05822445
11
Mock 2
MG_EQ05822227
1
MG_EQ05822473
3
MG_EQ05822497
10
MG_EQ05822439
11
Nuclease-free water
MG_EQ05822226
1
MG_EQ05822476
3
MG_EQ05822499
5
MG_EQ05822501
8
MG_EQ05822491
10
Unused swab+C2
MG_EQ05822202
1
MG_EQ05822493
5
MG_EQ05822500
8
MG_EQ05822485
10
These control samples have been created such that
Taxonomic composition of these control samples:
Code
get_taxa_colors <- function (taxa) {
tibble (
taxa = taxa
) |>
mutate (
cat =
case_when (
taxa == "Other" ~ "Other" ,
str_detect (taxa, "crispatus" ) ~ "crispatus" ,
str_detect (taxa, "LBP" ) ~ "LBP" ,
str_detect (taxa, "iner" ) ~ "iners" ,
str_detect (taxa, "revotella" ) ~ "prevotella" ,
str_detect (taxa, "ardnerella" ) ~ "gardnerella" ,
TRUE ~ "non lacto"
)
) |>
group_by (cat) |>
mutate (
color =
case_when (
taxa == "Other" ~ "gray80" ,
str_detect (taxa, "crispatus" ) ~ "orange" ,
str_detect (taxa, "LBP" ) ~ colorRampPalette (c ("orangered1" , "red4" ))(n ()),
str_detect (taxa, "iner" ) ~ "green3" ,
str_detect (taxa, "jensenii" ) ~ "green4" ,
str_detect (taxa, "ardnerella" ) ~ colorRampPalette (c ("dodgerblue1" , "dodgerblue4" ))(n ()),
str_detect (taxa, "revotella" ) ~ colorRampPalette (c ("purple1" , "purple4" ))(n ()),
TRUE ~ colorRampPalette (c ("turquoise3" , "black" ))(n ())
)
) |>
pull (color)
}
Code
control_types <- SE_mg$ SampleType |> unique () |> sort () |> extract (- 1 )
Code
map (
control_types,
~ {
tmp <-
SE_mg |>
as_tibble () |>
filter (SampleType == .x) |>
filter (rel_abs > 0 )
top_taxa <-
tmp |>
group_by (taxon_label) |>
summarise (total_relabs = sum (rel_abs)) |>
arrange (desc (total_relabs)) |>
slice_head (n = 25 ) |>
pull (taxon_label) |> sort ()
all_taxa <- tmp$ taxon_label |> unique () |> sort ()
tmp |>
ggplot () +
aes (y = rel_abs, x = .sample, fill = taxon_label) +
geom_col (color = "white" , linewidth = 0.1 ) +
scale_x_discrete ("Samples" ) +
scale_y_continuous ("Rel. ab." ) +
scale_fill_discrete (
"Top taxa" , breaks = top_taxa #, limits = all_taxa, values = get_taxa_colors(all_taxa)
) +
facet_grid (. ~ str_c ("Plate " , Ext_Lib_Plate), scales = "free" ) +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 )) +
labs (title = str_c ("Control samples : " , .x))
}
)
And it looks like the replicability across controls (plates?) is not very good.
Mock 1
We see that the relative abundances are not very replicable, but let’s check if detection (presence/absence) is better.
Code
tmp <-
SE_mg |>
as_tibble () |>
filter (SampleType == "Mock 1" ) |>
mutate (present = (rel_abs > 1 / 1000 )) |>
group_by (taxon_label) |>
mutate (tot_prop = sum (rel_abs)) |>
ungroup () |>
arrange (- tot_prop) |>
mutate (taxon_label = taxon_label |> fct_inorder ())
Code
tmp |>
ggplot () +
aes (x = taxon_label, y = .sample, fill = rel_abs |> log10 ()) +
geom_tile () +
ylab ("" ) +
scale_x_discrete ("Taxa" , breaks = NULL ) +
scale_fill_gradient (low = "gray99" , high = "red" , na.value = "white" )
Mock 2
In the Mock 2, we check for the proportion of species/taxa that were not expected based on the theoretical composition of Mock 2
Code
# Focus on Mock2
tmp <-
SE_mg |>
as_tibble () |>
filter (SampleType == "Mock 2" ) |>
filter (rel_abs > 0 ) |>
mutate (
expected =
str_detect (taxon_label, "LBP" ) |
str_detect (taxon_label, "Gardnerella" ) |
str_detect (taxon_label, "Fannyhessea" ) |
str_detect (taxon_label, "bivia" ) |
str_detect (taxon_label, "crispatus" ) |
str_detect (taxon_label, "gasseri" ) |
str_detect (taxon_label, "jensenii" ) |
str_detect (taxon_label, "Streptococcus agalactiae" ) |
str_detect (taxon_label, "Finegoldia magna" )
)
tmp |>
ggplot () +
aes (y = rel_abs, x = .sample, fill = expected) +
geom_col (color = "white" , linewidth = 0.1 ) +
scale_x_discrete ("Samples" ) +
scale_y_continuous ("Rel. ab." ) +
facet_grid (. ~ str_c ("Plate " , Ext_Lib_Plate), scales = "free" ) +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
Code
tmp |>
group_by (.sample, Ext_Lib_Plate, expected) |>
summarize (tot_rel_ab_pc = round (100 * sum (rel_abs), 2 ), .groups = "drop" ) |>
filter (! expected) |>
gt (caption = "Percent of relative abundance with unexpected taxa" )
Percent of relative abundance with unexpected taxa
.sample
Ext_Lib_Plate
expected
tot_rel_ab_pc
MG_EQ05822227
1
FALSE
4.53
MG_EQ05822439
11
FALSE
3.53
MG_EQ05822473
3
FALSE
1.83
MG_EQ05822497
10
FALSE
3.13
When only considering bacterial relative abundances, results look similar:
Code
# Focus on Mock2
tmp <-
SE_mg |>
as_tibble () |>
filter (SampleType == "Mock 2" ) |>
filter (rel_abs_bact > 0 ) |>
mutate (
expected =
str_detect (taxon_label, "LBP" ) |
str_detect (taxon_label, "Gardnerella" ) |
str_detect (taxon_label, "Fannyhessea" ) |
str_detect (taxon_label, "bivia" ) |
str_detect (taxon_label, "crispatus" ) |
str_detect (taxon_label, "gasseri" ) |
str_detect (taxon_label, "jensenii" ) |
str_detect (taxon_label, "Streptococcus agalactiae" ) |
str_detect (taxon_label, "Finegoldia magna" )
)
tmp |>
ggplot () +
aes (y = rel_abs_bact, x = .sample, fill = expected) +
geom_col (color = "white" , linewidth = 0.1 ) +
scale_x_discrete ("Samples" ) +
scale_y_continuous ("Rel. ab. (of bacterial content)" ) +
facet_grid (. ~ str_c ("Plate " , Ext_Lib_Plate), scales = "free" ) +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
Code
tmp |>
group_by (.sample, Ext_Lib_Plate, expected) |>
summarize (tot_rel_ab_pc = round (100 * sum (rel_abs_bact), 2 ), .groups = "drop" ) |>
filter (! expected) |>
gt (caption = "Percent of relative abundance with unexpected taxa" )
Percent of relative abundance with unexpected taxa
.sample
Ext_Lib_Plate
expected
tot_rel_ab_pc
MG_EQ05822227
1
FALSE
4.52
MG_EQ05822439
11
FALSE
3.50
MG_EQ05822473
3
FALSE
1.82
MG_EQ05822497
10
FALSE
3.13
It could be annoying that we are so close to the detection threshold for the primary outcomes definition.
Alpha-diversity of clinical samples and controls
Code
vegan:: diversity (SE_mg |> assay ("rel_abs" ), index = "shannon" , MARGIN = 2 ) |>
as_tibble () |>
mutate (sampleID = colnames (SE_mg)) |>
left_join (SE_mg |> colData () |> as_tibble (), by = join_by (sampleID)) |>
ggplot () +
aes (x = SampleType, y = value |> exp (), col = SampleType, fill = SampleType) +
geom_violin (scale = "width" , color = NA , alpha = 0.2 ) +
ggbeeswarm:: geom_quasirandom (size = 0.5 , alpha = 0.5 ) +
scale_y_log10 () +
labs (
x = "" ,
y = "Effective number of species based on shannon diversity index"
) +
guides (fill = "none" , color = "none" ) +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
Negative controls have lower total reads and lower alpha-diversity.
Proportion of LBP strains per sample
Do we detect any LBP strain?
Code
SE_mg |>
as_tibble () |>
filter (! is.na (LBP)) |>
ggplot () +
aes (x = rel_abs, fill = (rel_abs == 0 )) +
geom_histogram (binwidth = 0.01 ) +
scale_fill_manual ("" , values = c ("steelblue" , "gray" ), labels = c ("Rel. ab = 0" , "Rel. ab > 0" )) +
xlab ("Relative abundance of LBP strains" ) +
ylab ("Number of samples x LBP strain" )
Code
SE_mg |>
as_tibble () |>
filter (! is.na (LBP), rel_abs > 0 ) |>
ggplot () +
aes (x = rel_abs) +
geom_histogram (binwidth = 0.001 ) +
xlab ("Non-zero relative abundance of LBP strains" ) +
ylab ("Number of samples" ) +
facet_grid (LBP + .feature ~ .) +
theme (
strip.text.y = element_text (angle = 0 )
)
Code
SE_mg |>
as_tibble () |>
filter (! is.na (LBP), rel_abs > 0 ) |>
ggplot () +
aes (x = .feature) +
geom_bar () +
scale_x_discrete ("LBP strains" ) +
scale_y_continuous ("Number of samples with non-zero relative abundances" ) +
facet_grid (. ~ LBP, scales = "free" , space = "free" ) +
theme (
axis.text.x = element_text (angle = 45 , hjust = 1 )
)
Code
number_of_non_zero_LBP_strains_per_sample <-
SE_mg |>
as_tibble () |>
filter (! is.na (LBP)) |>
group_by (.sample) |>
summarize (
number_of_non_zero_LBP_strains = sum (rel_abs > 0 ),
)
number_of_non_zero_LBP_strains_per_sample |>
ggplot () +
aes (x = number_of_non_zero_LBP_strains) +
geom_histogram (binwidth = 0.5 ) +
scale_x_continuous ("Number of non-zero LBP strains per sample" , breaks = 0 : 15 , minor_breaks = 0 ) +
ylab ("Number of samples" )
There are 234 samples (24.22%) with at least one LBP strain detected (non-zero relative abundance).
Code
SE_mg |>
as_tibble () |>
filter (! is.na (LBP)) |>
left_join (
number_of_non_zero_LBP_strains_per_sample,
by = join_by (.sample)
) |>
filter (number_of_non_zero_LBP_strains != 0 ) |>
group_by (.feature) |>
mutate (tot_rel_abs_for_strain = sum (rel_abs)) |>
ungroup () |>
arrange (LBP, - tot_rel_abs_for_strain) |>
mutate (.feature = .feature |> fct_inorder ()) |>
group_by (.sample) |>
mutate (
LBP_rel_abs = rel_abs / sum (rel_abs),
score = weighted.mean (LBP_rel_abs, .feature |> as.numeric ()),
total_LBP_rel_abs = sum (rel_abs)
) |>
ungroup () |>
arrange (score) |>
mutate (.sample = .sample |> fct_inorder ()) |>
ggplot () +
aes (x = .feature, y = .sample, fill = rel_abs) +
geom_tile () +
scale_fill_continuous ("Rel. ab" , low = "white" , high = "steelblue2" ) +
scale_x_discrete ("Strains" ) +
scale_y_discrete ("Samples" , breaks = NULL ) +
facet_grid (. ~ LBP + strain_origin, scales = "free_x" , space = "free" ) +
theme (axis.text.x = element_text (angle = 90 , hjust = 1 )) +
labs (
caption = "Rel. ab. of LBP strains in samples with at least one LBP strain detected (non-zero)" ,
)
Code
samples_with_only_one_LBP_strain <-
number_of_non_zero_LBP_strains_per_sample |>
filter (number_of_non_zero_LBP_strains == 1 ) |>
pull (.sample)
SE_mg |>
as_tibble () |>
filter (! is.na (LBP)) |>
left_join (number_of_non_zero_LBP_strains_per_sample, by = ".sample" ) |>
filter (number_of_non_zero_LBP_strains != 0 ) |>
ggplot () +
aes (x = rel_abs, fill = number_of_non_zero_LBP_strains |> factor ()) +
geom_histogram () +
facet_grid (number_of_non_zero_LBP_strains ~ taxon_label) +
theme (strip.text.y = element_text (angle = 0 ))
Compositions
Code
summarized_rel_abs <-
SE_mg |>
as_tibble () |>
group_by (.feature) |>
mutate (max_rel_ab = max (rel_abs), mean_rel_ab = mean (rel_abs)) |>
ungroup () |>
mutate (
taxa =
case_when (
! is.na (LBP) ~ taxon_label,
max_rel_ab > 1 / 3 ~ taxon_label,
TRUE ~ "Other"
),
is_lacto = str_detect (Genus, "Lactobacillus" ) & (taxa != "Other" )
) |>
arrange (LBP, is_lacto, desc (max_rel_ab)) |>
mutate (
taxa = taxa |> fct_inorder ()
) |>
group_by (.sample, SampleType, taxa, LBP, is_lacto) |>
summarize (rel_abs = sum (rel_abs), .groups = "drop" ) |>
arrange (.sample, - rel_abs) |>
group_by (.sample) |>
mutate (
score = weighted.mean (taxa |> as.numeric (), rel_abs),
tot = sum (rel_abs),
dom = taxa[1 ]
) |>
ungroup () |>
arrange (score) |>
mutate (.sample = .sample |> fct_inorder ())
Code
summarized_rel_abs |>
ggplot () +
aes (x = .sample, y = rel_abs, fill = taxa) +
geom_col () +
scale_x_discrete ("Samples" , breaks = NULL ) +
facet_grid (. ~ SampleType, scales = "free" , space = "free" ) +
scale_fill_manual (
"" ,
values = get_taxa_colors (summarized_rel_abs$ taxa |> levels ()),
guide = guide_legend (nrow = 5 )
) +
theme (
strip.text.x = element_text (angle = 90 , hjust = 0 ),
legend.position = "bottom" ,
)
PCoA on counts
Code
dist_matrix <-
vegdist (
assay (SE_mg, "counts" ) |> t (),
method = "bray" )
pcoa <- wcmdscale (dist_matrix, eig = TRUE )
print (pcoa)
Call: wcmdscale(d = dist_matrix, eig = TRUE)
Inertia Rank
Total 359.39
Real 428.63 344
Imaginary -69.24 621
Results have 966 points, 344 axes
Eigenvalues:
[1] 60.97 50.39 33.62 21.71 15.98 13.65 12.50 10.89 9.51 8.84 8.08 6.79
[13] 6.25 6.08 5.51 4.77 4.59 4.39 4.38 4.21 4.07 3.73 3.61 3.32
[25] 3.15 3.07 2.94 2.81 2.69 2.58 2.46 2.45 2.32 2.27 2.20 2.10
[37] 2.04 1.96 1.87 1.80 1.71 1.62 1.56 1.54 1.52 1.44 1.40 1.38
[49] 1.32 1.29 1.24 1.21 1.18 1.17 1.12 1.10 1.08 1.07 1.05 1.03
[61] 1.00 0.97 0.95 0.93 0.89 0.88 0.87 0.85 0.82 0.80 0.79 0.77
[73] 0.75 0.74 0.72 0.72 0.71 0.68 0.67 0.66 0.64 0.63 0.62 0.61
[85] 0.60 0.59 0.58 0.57 0.55 0.54 0.53 0.52 0.51 0.51 0.50 0.49
[97] 0.48 0.48 0.47 0.46 0.45 0.45 0.44 0.43 0.42 0.42 0.41 0.41
[109] 0.40 0.40 0.39 0.39 0.38 0.38 0.37 0.36 0.36 0.35 0.34 0.34
(Showing 120 of 965 eigenvalues)
Weights: Constant
Code
pcoa$ eig |>
as.data.frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = str_remove (axis, "Eigenvalue" ),
var_explained = 100 * pcoa$ eig/ sum (pcoa$ eig)) |>
slice_head (n = 10 ) |>
gt () |>
tab_header (title = "Eigenvalues of the PCoA" ) |>
cols_label (axis = "Axis" , ` pcoa$eig ` = "Eigenvalue" , var_explained = "Variance explained" ) |>
fmt_number (columns = vars (` pcoa$eig ` , var_explained), decimals = 3 )
Eigenvalues of the PCoA
Axis
Eigenvalue
Variance explained
1
60.975
16.966
2
50.394
14.022
3
33.616
9.353
4
21.708
6.040
5
15.980
4.446
6
13.648
3.798
7
12.496
3.477
8
10.893
3.031
9
9.509
2.646
10
8.836
2.459
Code
pcoa$ eig |>
as_data_frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = axis |> as.numeric ()) |>
ggplot (aes (x = axis, y = value)) +
geom_col () +
labs (
x = "Axis" ,
y = "Eigenvalue"
)
Code
pcoa$ eig |>
as_data_frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = axis |> as.numeric ()) |>
filter (axis <= 20 ) |>
ggplot (aes (x = axis, y = value)) +
geom_col () +
labs (
x = "Axis" ,
y = "Eigenvalue"
)
Code
pcoa_data <-
tibble (
sampleID = rownames (pcoa$ points),
PCoA = pcoa$ points
) |>
left_join (
SE_mg |> colData () |> as_tibble (),
by = join_by (sampleID)
)
Code
pcoa_plot <- function (pcoa_tb, pcoa, color_var, axes = 1 : 2 ){
pcoa_tb <-
pcoa_tb |>
mutate (
col = as.factor (!! sym (color_var)),
x = PCoA[, axes[1 ]],
y = PCoA[, axes[2 ]]
)
pcoa_tb |>
ggplot () +
aes (x = x, y = y, color = col) +
geom_point (size = 2 , alpha = 0.7 ) +
coord_fixed () +
labs (
x = paste0 ("PCoA " , axes[1 ], " (" , round (100 * pcoa$ eig[axes[1 ]] / sum (pcoa$ eig), 1 ), "%)" ),
y = paste0 ("PCoA " , axes[2 ], " (" , round (100 * pcoa$ eig[axes[2 ]] / sum (pcoa$ eig), 1 ), "%)" ),
color = color_var
)
}
Type of sample
Code
sample_type_colors <- c ("gray" , "dodgerblue" , "orange" , "black" , "red" )
pcoa_plot (pcoa_data, pcoa, "SampleType" , c (1 , 2 )) + scale_color_manual (values = sample_type_colors)
Code
pcoa_plot (pcoa_data, pcoa, "SampleType" , c (3 , 4 )) + scale_color_manual (values = sample_type_colors)
Code
manova <- adonis2 (dist_matrix ~ pcoa_data$ SampleType)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix ~ pcoa_data$SampleType)
Df SumOfSqs R2 F Pr(>F)
Model 4 4.42 0.01229 2.989 0.001 ***
Residual 961 354.97 0.98771
Total 965 359.39 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Library Pool
Code
pcoa_plot (pcoa_data, pcoa, "Library.Pool" , c (1 , 2 ))
Code
pcoa_plot (pcoa_data, pcoa, "Library.Pool" , c (3 , 4 ))
Lane
Code
pcoa_plot (pcoa_data |> filter (Lane %in% 1 : 4 ), pcoa, "Lane" , c (1 , 2 ))
Code
pcoa_plot (pcoa_data |> filter (Lane %in% 1 : 4 ), pcoa, "Lane" , c (3 , 4 ))
PCoA on relative abundances
Code
dist_matrix_relab <-
vegdist (
relabs |> dplyr:: select (- c (CST, subCST, score)) |> column_to_rownames ("sampleID" ),
method = "bray" )
pcoa_relab <- wcmdscale (dist_matrix_relab, eig = TRUE )
print (pcoa_relab)
Call: wcmdscale(d = dist_matrix_relab, eig = TRUE)
Inertia Rank
Total 309.98
Real 381.84 326
Imaginary -71.86 639
Results have 966 points, 326 axes
Eigenvalues:
[1] 71.59 59.11 39.07 19.66 15.30 12.04 10.41 8.62 7.37 6.53 5.77 5.39
[13] 5.24 4.78 4.31 4.24 3.83 3.75 3.24 3.15 2.98 2.79 2.68 2.57
[25] 2.24 2.16 2.08 1.94 1.77 1.73 1.59 1.54 1.53 1.41 1.36 1.35
[37] 1.28 1.25 1.18 1.16 1.12 1.09 1.05 1.03 0.98 0.96 0.93 0.91
[49] 0.88 0.87 0.82 0.80 0.79 0.78 0.76 0.75 0.74 0.70 0.69 0.67
[61] 0.66 0.64 0.63 0.62 0.59 0.56 0.56 0.55 0.54 0.52 0.51 0.49
[73] 0.49 0.49 0.48 0.47 0.46 0.45 0.44 0.43 0.42 0.41 0.41 0.40
[85] 0.39 0.38 0.38 0.37 0.36 0.36 0.35 0.35 0.34 0.33 0.33 0.32
[97] 0.31 0.30 0.30 0.29 0.29 0.28 0.28 0.27 0.26 0.26 0.26 0.26
[109] 0.25 0.24 0.24 0.24 0.24 0.23 0.23 0.23 0.22 0.22 0.21 0.21
(Showing 120 of 965 eigenvalues)
Weights: Constant
Code
pcoa_relab$ eig |>
as.data.frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = str_remove (axis, "Eigenvalue" ),
var_explained = 100 * pcoa$ eig/ sum (pcoa_relab$ eig)) |>
slice_head (n = 10 ) |>
gt () |>
tab_header (title = "Eigenvalues of the PCoA" ) |>
cols_label (axis = "Axis" , ` pcoa_relab$eig ` = "Eigenvalue" , var_explained = "Variance explained" ) |>
fmt_number (columns = vars (` pcoa_relab$eig ` , var_explained), decimals = 3 )
Eigenvalues of the PCoA
Axis
Eigenvalue
Variance explained
1
71.593
19.670
2
59.108
16.257
3
39.074
10.844
4
19.664
7.003
5
15.305
5.155
6
12.043
4.403
7
10.414
4.031
8
8.623
3.514
9
7.367
3.068
10
6.533
2.850
Code
pcoa_relab$ eig |>
as_data_frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = axis |> as.numeric ()) |>
ggplot (aes (x = axis, y = value)) +
geom_col () +
labs (
x= "Axis" ,
y = "Eigenvalue"
)
Code
pcoa_relab$ eig |>
as_data_frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = axis |> as.numeric ()) |>
filter (axis <= 20 ) |>
ggplot (aes (x = axis, y = value)) +
geom_col () +
labs (
x= "Axis" ,
y = "Eigenvalue"
)
Code
pcoa_relab_data <-
tibble (
sampleID = rownames (pcoa_relab$ points),
PCoA = pcoa_relab$ points
) |>
left_join (
SE_mg |> colData () |> as_tibble (),
by = join_by (sampleID)
)
Type of sample
Code
sample_type_colors <- c ("gray" , "dodgerblue" , "orange" , "black" , "red" )
pcoa_plot (pcoa_relab_data, pcoa_relab, "SampleType" , c (1 , 2 )) + scale_color_manual (values = sample_type_colors)
Code
pcoa_plot (pcoa_relab_data, pcoa_relab, "SampleType" , c (3 , 4 )) + scale_color_manual (values = sample_type_colors)
Code
manova <- adonis2 (dist_matrix_relab ~ pcoa_relab_data$ SampleType)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix_relab ~ pcoa_relab_data$SampleType)
Df SumOfSqs R2 F Pr(>F)
Model 4 3.536 0.01141 2.7725 0.001 ***
Residual 961 306.448 0.98859
Total 965 309.984 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Library Pool
Code
pcoa_plot (pcoa_relab_data, pcoa_relab, "Library.Pool" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab_data, pcoa_relab, "Library.Pool" , c (3 , 4 ))
Lane
Code
pcoa_plot (pcoa_relab_data |> filter (Lane %in% 1 : 4 ), pcoa_relab, "Lane" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab_data |> filter (Lane %in% 1 : 4 ), pcoa_relab, "Lane" , c (3 , 4 ))
Saving SummarizedExperiment objects
We save the SE objects to disk
Code
saveRDS (
SE_mg,
str_c (
get_output_dir (data_source = data_source),
"01_se_mg_" , today () |> str_remove_all ("-" ), ".rds"
)
)